DAGMM (with LSTM autoencoder) for Anomaly Detection in Time Series Data¶

In [ ]:
import os
import time

# TF_ENABLE_ONEDNN_OPTS=0 disables the oneDNN optimization library in TensorFlow.
# TF reads this environment variable at *import* time, so it must be set BEFORE
# `import tensorflow` — setting it in a later cell (after the import) has no effect.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

# Project-local helpers (data loading, plotting, metrics, model utilities).
# NOTE(review): star imports hide which names come from which module — consider
# importing the used names explicitly once the notebook stabilizes.
from dataset import *
from plots import *
from metrics import *
from models_funtions import *

# Set style for matplotlib
plt.style.use("Solarize_Light2")

# Render plotly figures inline in the notebook.
import plotly.io as pio
pio.renderers.default = "notebook_connected"
In [ ]:
# Paths to the root directories of the dataset: collision-free recordings
# and recordings that contain collisions.
ROOTDIR_DATASET_NORMAL =  '../dataset/normal'
ROOTDIR_DATASET_ANOMALY = '../dataset/collisions'

# TF_ENABLE_ONEDNN_OPTS=0 means that the model will not use the oneDNN library for optimization.
# NOTE(review): TensorFlow reads this variable at import time, and `import tensorflow`
# already ran in the previous cell — so this assignment likely has no effect here.
# Move it before the tensorflow import to make it effective.

import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

Various parameters¶

In [ ]:
# Sampling frequency of the extracted features.
# Other values experimented with: '1.0', '0.1', '0.005'.
freq = '0.01'

# Base file names of the recordings on disk.
file_name_normal = "_20220811_rbtc_"
file_name_collisions = "_collision_20220811_rbtc_"

# Recording indices: runs 0/2/3/4 are collision-free; runs 1 and 5 contain collisions.
recording_normal = [0, 2, 3, 4]
recording_collisions = [1, 5]

# Folders caching pre-computed features; the frequency is embedded in the folder
# name with '.' replaced by '_' so it forms a valid path component.
freq_str = freq.replace(".", "_")
features_folder_normal = f"./features/normal{freq_str}/"
features_folder_collisions = f"./features/collisions{freq_str}/"

Data¶

In [ ]:
# Load (or read from the feature cache) the feature and raw dataframes for:
#  - the normal recordings,
#  - both collision recordings together (1 and 5),
#  - each collision recording on its own.
# Each call returns (features, raw signals, raw per-action signals); the per-action
# frame is unused for the normal data, hence the `_`.
df_features_normal, df_normal_raw, _ = get_dataframes(
    ROOTDIR_DATASET_NORMAL, file_name_normal, recording_normal, freq,
    features_folder_normal,
)
df_features_collisions, df_collisions_raw, df_collisions_raw_action = get_dataframes(
    ROOTDIR_DATASET_ANOMALY, file_name_collisions, recording_collisions, freq,
    f"{features_folder_collisions}1_5/",
)
df_features_collisions_1, df_collisions_raw_1, df_collisions_raw_action_1 = get_dataframes(
    ROOTDIR_DATASET_ANOMALY, file_name_collisions, [1], freq,
    f"{features_folder_collisions}1/",
)
df_features_collisions_5, df_collisions_raw_5, df_collisions_raw_action_5 = get_dataframes(
    ROOTDIR_DATASET_ANOMALY, file_name_collisions, [5], freq,
    f"{features_folder_collisions}5/",
)
Loading data.
Found 31 different actions.
Loading data done.

Loading features from file.
--- 0.0402834415435791 seconds ---
Loading data.
Found 31 different actions.
Loading data done.

Loading features from file.
--- 0.022567033767700195 seconds ---
Loading data.
Found 31 different actions.
Loading data done.

Loading features from file.
--- 0.02035999298095703 seconds ---
Loading data.
Found 31 different actions.
Loading data done.

Loading features from file.
--- 0.0225677490234375 seconds ---
In [ ]:
# Build train/test splits: training uses the full normal data (full_normal=True),
# testing uses the collision features — for both recordings together and for
# recordings 1 and 5 separately. `df_test*` keeps the test features as a
# dataframe for later annotation with predictions.
# NOTE(review): presumably y_train is all-normal labels here — confirm in
# get_train_test_data.
X_train, y_train, X_test, y_test, df_test = get_train_test_data(df_features_normal, df_features_collisions, full_normal=True)
X_train_1, y_train_1, X_test_1, y_test_1, df_test_1 = get_train_test_data(df_features_normal, df_features_collisions_1, full_normal=True)
X_train_5, y_train_5, X_test_5, y_test_5, df_test_5 = get_train_test_data(df_features_normal, df_features_collisions_5, full_normal=True)
c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\.venv\Lib\site-packages\sklearn\base.py:493: UserWarning:

X does not have valid feature names, but VarianceThreshold was fitted with feature names

c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\.venv\Lib\site-packages\sklearn\base.py:493: UserWarning:

X does not have valid feature names, but VarianceThreshold was fitted with feature names

c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\.venv\Lib\site-packages\sklearn\base.py:493: UserWarning:

X does not have valid feature names, but VarianceThreshold was fitted with feature names

Collisions¶

In [ ]:
# Load the annotated collision events (and their start times) for recordings 1 and 5.
collisions_rec1, collisions_init1 = get_collisions('1', ROOTDIR_DATASET_ANOMALY)
collisions_rec5, collisions_init5 = get_collisions('5', ROOTDIR_DATASET_ANOMALY)

# Merge the collisions of the two recordings in one dataframe.
# NOTE(review): indices are concatenated as-is (no ignore_index) — confirm
# downstream code does not rely on a unique index.
collisions_rec = pd.concat([collisions_rec1, collisions_rec5])
collisions_init = pd.concat([collisions_init1, collisions_init5])
In [ ]:
# Derive the time zones covered by each collision and the per-sample ground-truth
# labels aligned to the collision feature frames (combined, recording 1, recording 5).
collisions_zones, y_collisions = get_collisions_zones_and_labels(collisions_rec, collisions_init, df_features_collisions)
collisions_zones_1, y_collisions_1 = get_collisions_zones_and_labels(collisions_rec1, collisions_init1, df_features_collisions_1)
collisions_zones_5, y_collisions_5 = get_collisions_zones_and_labels(collisions_rec5, collisions_init5, df_features_collisions_5)

DAGMM for Anomaly Detection in Time Series Data¶

In [ ]:
from algorithms.dagmm import DAGMM

# DAGMM (Deep Autoencoding Gaussian Mixture Model) configured with an LSTM
# autoencoder over sliding windows of length 30.
# NOTE(review): hyperparameter semantics below are inferred from the names —
# confirm against algorithms/dagmm.py.
classifier = DAGMM(
    num_epochs=10,
    lambda_energy=0.1,          # weight of the sample-energy term in the loss
    lambda_cov_diag=0.005,      # weight of the covariance-diagonal regularizer
    lr=1e-4,
    batch_size=32,
    gmm_k=5,                    # number of GMM mixture components
    normal_percentile=80,       # energy percentile used to separate normal samples
    sequence_length=30,         # sliding-window length fed to the autoencoder
    autoencoder_type=DAGMM.AutoEncoder.LSTM,  # Using LSTM autoencoder
    hidden_size=32,
    autoencoder_args={
        'n_layers': (4, 4),     # (encoder, decoder) layer counts
        'use_bias': (True, True),
        'dropout': (0.1, 0.1)
    },
    seed=42,
    gpu=None,  # Set to None for CPU, or specify GPU index if available
    details=True
)

# Train the DAGMM on normal data only (semi-supervised anomaly detection:
# the model never sees collisions during training).
classifier.fit(X_train)
print("DAGMM training completed.")
100%|██████████| 10/10 [00:31<00:00,  3.11s/it]
DAGMM training completed.

Predictions¶

In [ ]:
# Score the test sets with the trained model and compute detection statistics
# using the MAD-based anomaly threshold; each call returns the test dataframe
# annotated with predictions (combined, recording 1, recording 5).
df_test = get_statistics(X_test, y_collisions, classifier, df_test, freq, threshold_type="mad")
df_test_1 = get_statistics(X_test_1, y_collisions_1, classifier, df_test_1, freq, threshold_type="mad")
df_test_5 = get_statistics(X_test_5, y_collisions_5, classifier, df_test_5, freq, threshold_type="mad")
Anomaly prediction completed.
Number of anomalies detected: 0 with threshold 33.560173678666125, std
Number of anomalies detected: 0 with threshold 32.196242037477596, mad
Number of anomalies detected: 16 with threshold 25.325382750609826, percentile
Number of anomalies detected: 0 with threshold 57.75940595152788, IQR
Number of anomalies detected: 197 with threshold 0.0, zero

choosen threshold type: mad, with value: 32.1962
F1 Score: 0.0000
Accuracy: 0.6569
Precision: 0.0000
Recall: 0.0000
              precision    recall  f1-score   support

           0       0.66      1.00      0.79       201
           1       0.00      0.00      0.00       105

    accuracy                           0.66       306
   macro avg       0.33      0.50      0.40       306
weighted avg       0.43      0.66      0.52       306

ROC AUC Score: 0.5204
c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1509: UndefinedMetricWarning:

Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.

c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1509: UndefinedMetricWarning:

Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1509: UndefinedMetricWarning:

Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1509: UndefinedMetricWarning:

Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Anomalies detected: 0
Best threshold: -19.5755 | F1 Score: 0.5147 | Precision: 0.3465 | Recall: 1.0000
Anomalies detected with best threshold: 303

	-------------------------------------------------------------------------------------

Anomaly prediction completed.
Number of anomalies detected: 13 with threshold 24.955992948358226, std
Number of anomalies detected: 17 with threshold 16.21054283240262, mad
Number of anomalies detected: 9 with threshold 27.424001065889996, percentile
Number of anomalies detected: 0 with threshold 37.60206255763769, IQR
Number of anomalies detected: 68 with threshold 0.0, zero

choosen threshold type: mad, with value: 16.2105
F1 Score: 0.1154
Accuracy: 0.7195
Precision: 0.1765
Recall: 0.0857
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       129
           1       0.18      0.09      0.12        35

    accuracy                           0.72       164
   macro avg       0.48      0.49      0.47       164
weighted avg       0.65      0.72      0.68       164

ROC AUC Score: 0.5320
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Anomalies detected: 17
Best threshold: -14.4213 | F1 Score: 0.3743 | Precision: 0.2303 | Recall: 1.0000
Anomalies detected with best threshold: 152

	-------------------------------------------------------------------------------------

c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\src\models_funtions.py:67: RuntimeWarning:

invalid value encountered in divide

Anomaly prediction completed.
Number of anomalies detected: 3 with threshold 24.83768828719799, std
Number of anomalies detected: 8 with threshold 19.777925820151964, mad
Number of anomalies detected: 8 with threshold 20.503467241923016, percentile
Number of anomalies detected: 3 with threshold 25.312643468379974, IQR
Number of anomalies detected: 141 with threshold 0.0, zero

choosen threshold type: mad, with value: 19.7779
F1 Score: 0.0000
Accuracy: 0.5461
Precision: 0.0000
Recall: 0.0000
              precision    recall  f1-score   support

           0       0.58      0.91      0.71        85
           1       0.00      0.00      0.00        56

    accuracy                           0.55       141
   macro avg       0.29      0.45      0.35       141
weighted avg       0.35      0.55      0.43       141

ROC AUC Score: 0.3624
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Anomalies detected: 8
Best threshold: 0.2194 | F1 Score: 0.5685 | Precision: 0.3972 | Recall: 1.0000
Anomalies detected with best threshold: 141

	-------------------------------------------------------------------------------------

c:\Users\VG User\Documents\GitHub\MLinAPP-FP01-14\src\models_funtions.py:67: RuntimeWarning:

invalid value encountered in divide

In [ ]:
# Overlay the ground-truth collision zones and the predicted anomaly zones on the
# raw signals for both collision recordings together.
plot_anomalies_true_and_predicted(df_collisions_raw, df_collisions_raw_action, collisions_zones, df_test, title="Collisions zones vs predicted zones for both recordings")
In [ ]:
# Same overlay, restricted to recording 1.
plot_anomalies_true_and_predicted(df_collisions_raw_1, df_collisions_raw_action_1, collisions_zones_1, df_test_1, title="Collisions zones vs predicted zones for recording 1")
In [ ]:
# Same overlay, restricted to recording 5.
plot_anomalies_true_and_predicted(df_collisions_raw_5, df_collisions_raw_action_5, collisions_zones_5, df_test_5, title="Collisions zones vs predicted zones for recording 5")